Analyzing Main Streaming Services
DATA ANALYSIS
DATA VISUALISING
DATA CLEANSING
PYTHON
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.express as px
import random
%matplotlib inline
df=pd.read_csv('moviestreams.csv')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
# df.to_csv('moviestreams.csv',index=False)
df
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | Type | Directors | Genres | Country | Language | Runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Inception | 2010 | 13+ | 8.8 | 87% | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 1 | The Matrix | 1999 | 18+ | 8.7 | 87% | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 2 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84% | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 3 | Back to the Future | 1985 | 7+ | 8.5 | 96% | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 4 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97% | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16739 | The Ghosts of Buxley Hall | 1980 | NaN | 6.2 | NaN | 0 | 0 | 0 | 1 | 0 | Bruce Bilson | Comedy,Family,Fantasy,Horror | United States | English | 120.0 |
| 16740 | The Poof Point | 2001 | 7+ | 4.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neal Israel | Comedy,Family,Sci-Fi | United States | English | 90.0 |
| 16741 | Sharks of Lost Island | 2013 | NaN | 5.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neil Gelinas | Documentary | United States | English | NaN |
| 16742 | Man Among Cheetahs | 2017 | NaN | 6.6 | NaN | 0 | 0 | 0 | 1 | 0 | Richard Slater-Jones | Documentary | United States | English | NaN |
| 16743 | In Beaver Valley | 1950 | NaN | NaN | NaN | 0 | 0 | 0 | 1 | 0 | James Algar | Documentary,Short,Family | United States | English | 32.0 |
16744 rows × 15 columns
colors=['brown','red','orange','salmon','purple','blue','green','lightblue','lightsalmon']
df.shape
(16744, 15)
df.count()
Title 16744 Year 16744 Age 7354 IMDb 16173 Rotten Tomatoes 5158 Netflix 16744 Hulu 16744 Prime Video 16744 Disney+ 16744 Type 16744 Directors 16018 Genres 16469 Country 16309 Language 16145 Runtime 16152 dtype: int64
cols = df.columns.to_list()
cols
['Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix', 'Hulu', 'Prime Video', 'Disney+', 'Type', 'Directors', 'Genres', 'Country', 'Language', 'Runtime']
df.isna().sum()
Title 0 Year 0 Age 9390 IMDb 571 Rotten Tomatoes 11586 Netflix 0 Hulu 0 Prime Video 0 Disney+ 0 Type 0 Directors 726 Genres 275 Country 435 Language 599 Runtime 592 dtype: int64
REMOVING '+' IN AGE :
#Age={'18+':18,'7+':7,'13+':13,'all':0,'16':16}
#df.Age=df.Age.map(Age)
#df
REMOVING '%' IN ROTTEN TOMATOES
df['Rotten Tomatoes'] = df['Rotten Tomatoes'].str.replace('%', '').astype(float)
df
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | Type | Directors | Genres | Country | Language | Runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Inception | 2010 | 13+ | 8.8 | 87.0 | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 1 | The Matrix | 1999 | 18+ | 8.7 | 87.0 | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 2 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84.0 | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 3 | Back to the Future | 1985 | 7+ | 8.5 | 96.0 | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 4 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97.0 | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16739 | The Ghosts of Buxley Hall | 1980 | NaN | 6.2 | NaN | 0 | 0 | 0 | 1 | 0 | Bruce Bilson | Comedy,Family,Fantasy,Horror | United States | English | 120.0 |
| 16740 | The Poof Point | 2001 | 7+ | 4.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neal Israel | Comedy,Family,Sci-Fi | United States | English | 90.0 |
| 16741 | Sharks of Lost Island | 2013 | NaN | 5.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neil Gelinas | Documentary | United States | English | NaN |
| 16742 | Man Among Cheetahs | 2017 | NaN | 6.6 | NaN | 0 | 0 | 0 | 1 | 0 | Richard Slater-Jones | Documentary | United States | English | NaN |
| 16743 | In Beaver Valley | 1950 | NaN | NaN | NaN | 0 | 0 | 0 | 1 | 0 | James Algar | Documentary,Short,Family | United States | English | 32.0 |
16744 rows × 15 columns
language=pd.DataFrame(dict(df.Language.value_counts().head(10)).items(),columns=['Languages','No. Of Movies'])
fig=px.bar(language,
x=language.Languages,
y=language['No. Of Movies'],
title='Top 10 languages in Streaming Movies',
text=language['No. Of Movies'],
height=600)
fig.update_traces(texttemplate='%{text:.4s}',textposition='outside')
fig.show()
#HTML(fig.to_html())
fig=px.pie(language,names=language.Languages,values=language['No. Of Movies'],
title='Top 10 languages in Streaming Services',
height=600)
#fig.update_traces(textposition='outside')
fig.show()
#HTML(fig.to_html())
#Age Graph Functions
def making_ageGraph(df:pd.DataFrame,stream:str,height:float=600):
color=random.choice(colors)
df={'Age':df.Age.value_counts().index,'Counts':df.Age.value_counts()}
fig = px.bar(df,
x='Age',
y='Counts',
title=f"Number of Movies in specific age group in {stream} service",
text='Counts',
height=height)
fig.update_traces(marker_color=color,texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
fig.show()
#return HTML(fig.to_html())
making_ageGraph(df,'All')
netflix_df=df[df['Netflix']==1]
making_ageGraph(netflix_df,'Netflix')
prime_df=df[df['Prime Video']==1]
making_ageGraph(prime_df,'Amazon Prime Video')
Disney_df=df[df['Disney+']==1]
making_ageGraph(Disney_df,'Disney+')
Hulu_df=df[df['Hulu']==1]
making_ageGraph(Hulu_df,'Hulu')
A Tomatometer score is calculated for a movie or TV show after it receives at least five reviews. When at least 60% of reviews for a movie or TV show are positive, a red tomato is displayed to indicate its Fresh status.
Rotten Tomatoes gives films a score out of 100 based on the averaged reviews of professional film critics. If a film gets a rating of 60 or more it gets a 'fresh' red tomato on the site. Less than 60 and it gets a rotten tomato.
fig = px.bar(df,
x=df['Rotten Tomatoes'].value_counts().index,
y=df['Rotten Tomatoes'].value_counts(),
title="Overall Rotten Tomato Ratings",
text=df['Rotten Tomatoes'].value_counts(),
height=600)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
fig.show()
#HTML(fig.to_html())
rt_scores = pd.DataFrame({'Streaming Service': ["Prime Video", "Hulu","Disney+","NetFlix"],
'Rotten Tomato Score' : [netflix_df['Rotten Tomatoes'].value_counts().iloc[0],
prime_df['Rotten Tomatoes'].value_counts().iloc[0],
Disney_df['Rotten Tomatoes'].value_counts().iloc[0],
Hulu_df['Rotten Tomatoes'].value_counts().iloc[0]],
'Highest Value':[netflix_df['Rotten Tomatoes'].value_counts().index[0],
prime_df['Rotten Tomatoes'].value_counts().index[0],
Disney_df['Rotten Tomatoes'].value_counts().index[0],
Hulu_df['Rotten Tomatoes'].value_counts().index[0]]})
rt_scores.head()
| Streaming Service | Rotten Tomato Score | Highest Value | |
|---|---|---|---|
| 0 | Prime Video | 130 | 100.0 |
| 1 | Hulu | 257 | 100.0 |
| 2 | Disney+ | 19 | 100.0 |
| 3 | NetFlix | 18 | 100.0 |
rt_scores.sort_values(ascending=False, by="Rotten Tomato Score").plot(kind='bar', x='Streaming Service', y='Rotten Tomato Score',
color='Violet',
title="Streaming Service with 100% Rotten Tomato Score")
plt.show()
sorted_rt_score=rt_scores.sort_values(ascending=False, by="Rotten Tomato Score")
fig = px.bar(sorted_rt_score,
x=sorted_rt_score['Streaming Service'],
y=sorted_rt_score['Rotten Tomato Score'],
title="Rotten Tomato Ratings For Each Services",
text=sorted_rt_score['Rotten Tomato Score'],
height=600)
fig.update_traces(marker_color='purple',texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
fig.show()
#HTML(fig.to_html())
#IMDb Graph Functions
def making_IMDbGraph(df:pd.DataFrame,stream:str,height:float=600):
color=random.choice(colors)
df=pd.DataFrame(dict(df['IMDb'].value_counts()).items(),columns=['IMDb','Counts'])
fig = px.bar(df,
x=df['IMDb'],
y=df['Counts'],
title=f"Overall IMDb Ratings For {stream} Service",
text=df['IMDb'].value_counts(),
height=height)
fig.update_traces(marker_color=color,texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
fig.show()
#return HTML(fig.to_html())
making_IMDbGraph(df,'All')
making_IMDbGraph(netflix_df,'Netflix')
making_IMDbGraph(prime_df,'Amazon Prime')
making_IMDbGraph(Disney_df,'Diseny+')
making_IMDbGraph(Hulu_df,'Hulu')
RuntimeCount=pd.DataFrame(dict(df.Runtime.value_counts().sort_values(ascending=False)[:10]).items(),
columns=['Runtime','Counts'])
RuntimeCount
| Runtime | Counts | |
|---|---|---|
| 0 | 90.0 | 971 |
| 1 | 95.0 | 489 |
| 2 | 92.0 | 434 |
| 3 | 93.0 | 422 |
| 4 | 85.0 | 408 |
| ... | ... | ... |
| 152 | 19.0 | 8 |
| 153 | 32.0 | 8 |
| 154 | 9.0 | 8 |
| 155 | 7.0 | 8 |
| 156 | 10.0 | 8 |
157 rows × 2 columns
fig = px.bar(RuntimeCount,
x='Runtime',
y='Counts',
title="Count Of Runtimes Of Movies",
text=RuntimeCount['Runtime'],
height=600)
fig.update_traces(marker_color='purple',texttemplate='%{text:.2s}', textposition='outside')
fig.show()
#HTML(fig.to_html())
df.Directors.value_counts()
Jay Chapman 36
Joseph Kane 30
Cheh Chang 26
Sam Newfield 22
Jim Wynorski 22
..
Richard Ciupka 1
Ric Esther Bienstock 1
Ben Browder 1
Anocha Suwichakornpong 1
Richard Slater-Jones 1
Name: Directors, Length: 11338, dtype: int64
DirCount=pd.DataFrame(dict(df.Directors.value_counts()).items(),
columns=['Director','No. Of Movies'])
DirCount.sort_values(by='No. Of Movies',ascending=False,inplace=True)
DirCount=DirCount.head(20)
DirCount
| Director | No. Of Movies | |
|---|---|---|
| 0 | Jay Chapman | 36 |
| 1 | Joseph Kane | 30 |
| 2 | Cheh Chang | 26 |
| 3 | Sam Newfield | 22 |
| 4 | Jim Wynorski | 22 |
| 5 | David DeCoteau | 21 |
| 6 | William Beaudine | 21 |
| 7 | Jay Karas | 20 |
| 8 | Raúl Campos,Jan Suter | 20 |
| 9 | Marcus Raboy | 18 |
| 10 | Fred Olen Ray | 17 |
| 11 | William Witney | 17 |
| 12 | Scott L. Montoya | 17 |
| 13 | Lesley Selander | 16 |
| 14 | Mark Atkins | 16 |
| 15 | Paul Hoen | 15 |
| 16 | William Nigh | 14 |
| 20 | Philip Gardiner | 13 |
| 19 | Manny Rodriguez | 13 |
| 17 | Robert N. Bradbury | 13 |
fig = px.bar(DirCount,
x=DirCount['Director'],
y=DirCount['No. Of Movies'],
title="Directors And The Count Of Movies They Have Directed",
text=DirCount['No. Of Movies'],
height=600)
fig.update_traces(textposition='outside')
fig.show()
#HTML(fig.to_html())
def movieDirectedBy(df:pd.DataFrame,name:str):
dfn=df['Joseph Kane' == df.Directors]
dfn.fillna('null',inplace=True)
fig = px.bar(dfn,
y=dfn['IMDb'],
x=dfn['Title'],
title=f"Movies Directed By {name}",
text=dfn['Genres'],
height=600)
fig.update_traces(marker_color='salmon',textfont_size=10,textposition='inside')
fig.show()
#return HTML(fig.to_html())
movieDirectedBy(df,'Joseph Kane')
C:\Users\siddh\AppData\Local\Temp\ipykernel_6444\4066340104.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
genres=dict(df.Genres.value_counts())
gen=[]
for i in genres.keys():
i=i.split(',')
for j in i:
gen.append(j.strip())
genres_df=pd.DataFrame(dict(pd.Series(gen).value_counts()).items(),
columns=['Genres','No. Of Movies'])
genres_df
| Genres | No. Of Movies | |
|---|---|---|
| 0 | Drama | 868 |
| 1 | Comedy | 654 |
| 2 | Adventure | 560 |
| 3 | Action | 553 |
| 4 | Thriller | 467 |
| 5 | Family | 426 |
| 6 | Romance | 420 |
| 7 | Fantasy | 371 |
| 8 | Crime | 347 |
| 9 | Mystery | 318 |
| 10 | Sci-Fi | 312 |
| 11 | Horror | 296 |
| 12 | Animation | 265 |
| 13 | Documentary | 249 |
| 14 | History | 198 |
| 15 | Biography | 190 |
| 16 | Music | 171 |
| 17 | Musical | 171 |
| 18 | War | 170 |
| 19 | Western | 168 |
| 20 | Short | 141 |
| 21 | Sport | 126 |
| 22 | News | 36 |
| 23 | Film-Noir | 25 |
| 24 | Reality-TV | 8 |
| 25 | Talk-Show | 8 |
| 26 | Game-Show | 6 |
fig=px.bar(genres_df,
x=genres_df.Genres,
y=genres_df['No. Of Movies'],
title='Movies In Different Genres',
text=genres_df['No. Of Movies'],
height=600)
fig.update_traces(marker_color='brown',textfont_size=10,textposition='outside')
fig.show()
#HTML(fig.to_html())
def topMoviesIn(df:pd.DataFrame,stream:str,over:float=8.5):
color=random.choice(colors)
data= df[df['IMDb']>over]
data= data[['Title', 'IMDb', 'Genres']].sort_values(ascending=False, by='IMDb')
fig=px.bar(data,
x=data.Title,
y=data.IMDb,
title=f'Top Movies in {stream}',
text=data.Genres,
height=600)
fig.update_traces(marker_color=color,textposition='inside')
fig.show()
#return HTML(fig.to_html())
topMoviesIn(netflix_df,'Netflix')
topMoviesIn(prime_df,'Amazon Prime',8.8)
topMoviesIn(Disney_df,'Disney+',8)
topMoviesIn(Hulu_df,'Hulu',8)
def MoviesBefore(df:pd.DataFrame,stream:str,before:int=1990):
avg=df.Runtime.mean()
df=df[df.Year.astype(int)<before].nlargest(20,'IMDb','first')
color=random.choice(colors)
fig=px.bar(df,
y=df.Title,
x=df.Year,
title=f'Movies Before 1990 On {stream} Stream',
text=df.Year,
height=600)
fig.update_traces(marker_color=color,textposition='inside')
fig.show()
#return HTML(fig.to_html())
MoviesBefore(df,'All')
MoviesBefore(netflix_df,'Netflix')
MoviesBefore(prime_df,'Amazon Prime')
MoviesBefore(Disney_df,'Disney+')
MoviesBefore(Hulu_df,'Hulu')
netflix_avg = netflix_df.Runtime.mean()
prime_avg = prime_df.Runtime.mean()
Disney_avg = Disney_df.Runtime.mean()
Hulu_avg = Hulu_df.Runtime.mean()
avg = [df.Runtime.mean(),netflix_avg,prime_avg,Disney_avg,Hulu_avg]
AvgDf=pd.DataFrame({'Streaming Service':'Overall Netflix Amazon Disney+ Hulu'.split(' '),'Screen Time':avg})
AvgDf
| Streaming Service | Screen Time | |
|---|---|---|
| 0 | Overall | 93.413447 |
| 1 | Netflix | 98.912900 |
| 2 | Amazon | 92.293980 |
| 3 | Disney+ | 90.425225 |
| 4 | Hulu | 97.396610 |
fig = px.bar(AvgDf,
y=AvgDf['Streaming Service'],
x=AvgDf['Screen Time'],
title='Screen Time On Each Stream',
text=AvgDf['Screen Time'],
height=600)
fig.update_traces(texttemplate='%{text:.4s} mins',textposition='inside')
fig.show()
#HTML(fig.to_html())
!jupyter nbconvert --to html TopMovieStreaming.ipynb
[NbConvertApp] Converting notebook TopMovieStreaming.ipynb to html [NbConvertApp] Writing 82041920 bytes to TopMovieStreaming.html